"""
Code to pre-process and clean scraped sections 

First created: 4/21/23
Final edit: 6/1/24

"""


import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import glob
import re
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
import time
import os

# Regex fragments marking the end of the bill body; they are OR-ed together
# with "|" where they are used.
bill_endings = [
    '&lt;all&gt;',
    '(?:Union)? Calendar No.',
    # A non-capturing group is required here: the former "[House|Senate]" was a
    # character class and matched "Passed the " followed by any one of those
    # letters (e.g. "Passed the amendment").
    'Passed the (?:House|Senate)',
    'Speaker of the House of Representatives[.]',
    'Attest:',
]
# All twelve month names (October was previously missing).
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Domain-specific words appended to the NLTK English stopword list.
extra_stopw = [
    'shall', 'section', 'act', 'secretary', 'subsection', 'may', 'sec',
    'states', 'provided', 'paragraph', 'title', 'ii', 'amended', 'made',
    'date', 'inserting', 'law', 'agency', 'striking', 'amount', 'activities',
    'programs', 'authorized', 'report', 'subparagraph', '&lt;gr-thn-eq&gt;',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december',
]
# Leftover markup / bill-version tokens, each applied as a regex and deleted
# from the cleaned text.
tokens_bottom = [
    'lt gt', ' lt ', ' gt ', 'hr eh', 'hr ih', 'hr sc', 'hr enr', 'hr rh',
    'hr rs', 'hr eas', 'hr es', 'hr is', 'hr as',
]

# Header phrases; everything up to and including the first match is discarded.
# (The upper-case concurrent-resolution entry previously carried a stray '"'
# and therefore could not match ordinary header text.)
front_list = [
    'Resolved, ', '  JOINT RESOLUTION', '  CONCURRENT RESOLUTION',
    '  AMENDMENT', '  AMENDMENTS', '  A BILL', '  RESOLUTION', '  AN ACT',
    '  Joint Resolution', '  Concurrent Resolution', '  Amendment',
    '  Amendments', '  A Bill', '  An Act', '  Resolution',
]
# Matches HTML/XML tags and character entities so they can be stripped.
cleanr = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
# Combined stopword vocabulary: NLTK's English list followed by the
# bill-specific terms in extra_stopw.
stop_words = [*stopwords.words('english'), *extra_stopw]

#Removes line breaks of various types
def rm_breaks(text):
    """Collapse every line break in ``text`` into a single space.

    The earlier implementation handed regular-expression patterns to
    ``str.replace``, which matches literal text only, so its first two steps
    never fired. The remaining step deleted each newline outright, which glued
    the last word of a line onto the first word of the next whenever no other
    whitespace separated them.

    Args:
        text: Raw section text, possibly spanning several lines.

    Returns:
        The text with each line break -- together with the horizontal
        whitespace right before it and any whitespace (including further
        line breaks) after it -- replaced by one space.
    """
    # Horizontal whitespace is listed explicitly before the newline so the
    # pattern cannot backtrack over long runs of spaces that contain no break.
    return re.sub(r"[ \t\r\f\v]*\n\s*", " ", text)

#Removes stopwords
_ALL_STOPWORDS = None


def _all_stopwords():
    """Return the vocabulary from ``stopwords.words()`` as a set, loaded once."""
    global _ALL_STOPWORDS
    if _ALL_STOPWORDS is None:
        _ALL_STOPWORDS = frozenset(stopwords.words())
    return _ALL_STOPWORDS


def rm_stopwords(text):
    """Drop stopwords, one-character tokens and leftover markup tokens.

    Args:
        text: Text to tokenize with ``word_tokenize`` and filter.

    Returns:
        The surviving tokens joined by single spaces, with every pattern in
        ``tokens_bottom`` deleted and whitespace re-normalized.
    """
    # The stopword corpus used to be re-read for every single token; it is now
    # read once and kept in a set, which yields the same filtering result.
    # NOTE(review): stopwords.words() is called without a language, which per
    # the NLTK corpus reader API covers every bundled language; the
    # module-level `stop_words` list (English + extra_stopw) is not consulted
    # here -- confirm that is intended.
    vocabulary = _all_stopwords()
    kept = [
        token
        for token in word_tokenize(text)
        if token not in vocabulary and len(token) > 1
    ]
    cleaned = " ".join(kept)

    # Each entry of tokens_bottom is applied as a regex and removed.
    for pattern in tokens_bottom:
        cleaned = re.sub(pattern, "", cleaned)

    # Removing tokens can leave doubled or edge spaces behind.
    return " ".join(cleaned.split())

# Code references of the form "section 12(a)(3)"; the trailing parentheses may
# be empty. Raw string avoids the invalid "\(" escapes of a plain literal.
_CODE_REF_PATTERN = re.compile(r'section [0-9]{1,}\([a-z]\)\([0-9]*\)')
# Translation table that deletes spaces and both parentheses.
_CODE_REF_STRIP = str.maketrans('', '', ' ()')


def preserve_bill(text):
    """Fuse code references such as ``section 12(a)(3)`` into one token.

    Every match of the reference pattern has its space and parentheses
    removed (``section 12(a)(3)`` becomes ``section12a3``).

    Args:
        text: Lower-cased section text.

    Returns:
        The text with each matching reference replaced by its compact form;
        text without matches is returned unchanged.
    """
    return _CODE_REF_PATTERN.sub(
        lambda found: found.group().translate(_CODE_REF_STRIP), text
    )

def rm_sec(text):
    """Strip a leading ``sec <number>`` label from the cleaned text.

    Only the first 20 characters are inspected. When the text starts with
    ``sec``, every ``sec <digits>`` / ``sec `` occurrence inside that window
    is deleted and the remainder of the text is left untouched.

    The window is spliced back by position. The earlier version reused the
    window itself as a regex pattern, so a ``$`` inside it (restored dollar
    amounts) made the substitution silently fail, and any later repeat of the
    same 20 characters was rewritten as well.

    Args:
        text: Cleaned section text.

    Returns:
        The text with the section label removed from its opening window, or
        the original text when it does not start with ``sec``.
    """
    window = 20
    head = text[:window]
    if not head.startswith('sec'):
        return text

    cleaned_head = re.sub(r'(sec [0-9]*|sec )', '', head)
    return cleaned_head + text[window:]

#Setting working directory as scraped and split sections
# Congress being processed. It drives both the input folder and the metadata
# filter so the two cannot drift apart. CHANGE THIS FOR EACH CONGRESS
# NOTE(review): the metadata filter previously used 115 while the folder used
# 111; 111 is assumed to be the intended value -- confirm.
congress_num = 111
os.chdir('/replication/sections/{}'.format(congress_num))

# Every section file in the working directory
secs = glob.glob('*.txt')
secs_full = secs

###Add date data to sections from congressional bills project metadata    
meta_data = pd.read_csv(r'/replication/cbp_metadata.csv')
meta_data['action.date.x'] = pd.to_datetime(meta_data['action.date.x'], errors='coerce')
# Keep only the needed columns for this congress; .copy() makes the column
# assignments below write to an independent frame instead of a slice.
meta_sub = meta_data.loc[
    meta_data['congress'] == congress_num,
    ['version', 'action.date.x', 'congress'],
].copy()
meta_data = []  # drop the reference to the full metadata table

meta_sub['bill'] = meta_sub['version'].str.slice(start=6) #Get correct version variable

# The column is already datetime, so the deprecated infer_datetime_format
# argument is not needed; utc=True is kept.
meta_sub['date'] = pd.to_datetime(meta_sub['action.date.x'], utc=True) #Clean action date


#Section cleaning Code
# Patterns are compiled once instead of being re-joined for every file.
front_pattern = re.compile("|".join(front_list))
ending_pattern = re.compile("|".join(bill_endings))
title_break_pattern = re.compile(r'\n\n(?!section|sec\.)[a-z].*\n\n')
# Keeps dollar amounts, percentages and section references (group 1) and
# deletes every other digit run (group 1 is empty for those matches).
number_pattern = re.compile(
    r'(?:(\$[0-9]*|[0-9]*percentx|section [0-9]{1,}\([a-z]\)\([0-9]{1,}\))|\d+)(?=\D|$)'
)

sect_list = []
count = 0
start = time.process_time()
for j in secs:
    # The context manager closes each file; handles used to be left open for
    # the whole run.
    with open(j, 'r') as txt:
        txtContent = txt.read()

    # Cut everything up to and including a header phrase, at most twice.
    for _ in range(2):
        top = front_pattern.search(txtContent)
        if top is not None:
            txtContent = txtContent[top.end():]

    # Cut everything from the first end-of-bill marker onwards.
    bot = ending_pattern.search(txtContent)
    if bot is not None:
        txtContent = txtContent[:bot.start()]

    txtContent = cleanr.sub('', txtContent)
    txtContent = txtContent.lower()

    # Stop at the first stand-alone line that does not begin with
    # "section"/"sec.".
    title_break = title_break_pattern.search(txtContent)
    if title_break is not None:
        txtContent = txtContent[:title_break.start()]

    #If you want original text in there as well
    old = txtContent
    txtContent = rm_breaks(txtContent)
    dollar = 1 if '$' in txtContent else 0

    txtContent = txtContent.replace(',', '')
    # NOTE(review): " percent" is tried before " percentage", so "percentage"
    # becomes "percentxage"; left as-is to keep the output unchanged.
    txtContent = re.sub('( {1,}percent| {1,}percentage|%)', 'percentx', txtContent)
    txtContent = number_pattern.sub(r"\1", txtContent)
    txtContent = preserve_bill(txtContent)
    txtContent = re.sub('[^A-Za-z0-9$]+', ' ', txtContent)
    #Keeps stopword removal from removing $/% signs
    txtContent = txtContent.replace('$', 'dollarx')
    txtContent = rm_stopwords(txtContent)
    #Convert back to $/% signs
    txtContent = txtContent.replace('dollarx', '$')
    txtContent = txtContent.replace('percentx', '%')
    txtContent = rm_sec(txtContent)

    # Bill id = file name up to its last underscore
    bill = j.rsplit('_', 1)[0]
    #Add old if you want to
    sect_list.append([bill, j, old, txtContent, dollar])
    count += 1
    print(count)

print(time.process_time() - start)

#Create dataframe for cleaned data and add in date for each section
cleandf = pd.DataFrame(sect_list, columns=['bill', 'section', 'original_text', 'clean_text','dollars'])
cleandf['bill'] = cleandf['bill'].str.replace('_', "")
#Change misclassified bill in 103rd
#cleandf['bill'] = cleandf['bill'].replace({'103s2650rs': '103hr2650rs'}, regex=True)
#cleandf['section'] = cleandf['section'].replace({'103s2650_rs': '103hr2650_rs'}, regex=True)
cleandf = pd.merge(cleandf, meta_sub, on='bill', how='left')
cleandf = cleandf.sort_values(by='date')
#Check for missing dates
# Per-bill count of rows that have a null in any column (diagnostic only)
null_data = cleandf[cleandf.isnull().any(axis=1)]
null_data = null_data[['bill']]
null_data = null_data.groupby('bill').count()
null_data = null_data.reset_index()

cleandf = cleandf[['bill', 'section', 'original_text', 'clean_text','dollars', 'date']] #Export this file for sections that include boilerplate

#Call in boilerplate and remove boilerplate
# The context manager closes the file once the phrases are read.
with open('/replication/boilerplate.txt') as boilerplate_list:
    boilerplate = [elem.strip('\n') for elem in boilerplate_list.readlines()]

cleandf['boilerplate'] = 0
for phrase in boilerplate:
    # An empty line would be a prefix of every text and flag all rows.
    if not phrase:
        continue
    cleandf.loc[cleandf['clean_text'].str.startswith(phrase, na=False), 'boilerplate'] = 1

cleandf.loc[cleandf['clean_text'].str.contains('short title', na=False), 'boilerplate'] = 1 #Remove short title sections

substance_df = cleandf.loc[cleandf['boilerplate'] == 0]
# Absolute path like the other /replication inputs: the working directory is
# the sections folder, so the former relative path resolved underneath it.
# NOTE(review): confirm this is the intended output location.
substance_df.to_csv(r'/replication/cleaned_sections/111/111_sections_noboilerplate.csv')
